In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline

np.random.seed(42)
In [2]:
df = pd.read_csv('classroom_actions.csv')
df.head()
Out[2]:
timestamp id group total_days completed
0 2015-08-10 17:06:01.032740 610019 experiment 97 True
1 2015-08-10 17:15:28.950975 690224 control 75 False
2 2015-08-10 17:34:40.920384 564994 experiment 128 True
3 2015-08-10 17:50:39.847374 849588 experiment 66 False
4 2015-08-10 19:10:40.650599 849826 experiment 34 False
In [5]:
# Create dataframe with all control records
control_df = df.query('group == "control"')

# Compute completion rate
control_cr = control_df.completed.mean()
control_cr2 = len(control_df.query('completed==True')) / len(control_df)
# Display completion rate
control_cr, control_cr2
Out[5]:
(0.37199519230769229, 0.3719951923076923)
In [6]:
# Create dataframe with all experiment records
experiment_df =  df.query('group=="experiment"')

# Compute completion rate
experiment_ctr = experiment_df.completed.mean()

# Display completion rate
experiment_ctr
Out[6]:
0.39353348729792148
In [8]:
# Compute observed difference in completion rates
obs_diff = experiment_ctr - control_cr

# Display observed difference in completion rates
obs_diff
Out[8]:
0.02153829499022919
In [9]:
# Create sampling distribution for difference in completion rates
# with boostrapping
diffs = []
for _ in range(10000):
    sample = df.sample(len(df), replace=True) 
    sample_control_cr = sample.query('group=="control"').completed.mean() 
    sample_experiment_cr = sample.query('group=="experiment"').completed.mean()
    diffs.append(sample_experiment_cr - sample_control_cr) 
In [10]:
# convert to numpy array
diffs = np.array(diffs)
In [11]:
# plot distribution
plt.hist(diffs) 
Out[11]:
(array([   10.,    85.,   449.,  1353.,  2457.,  2778.,  1837.,   776.,
          223.,    32.]),
 array([-0.0381512 , -0.02679438, -0.01543755, -0.00408072,  0.0072761 ,
         0.01863293,  0.02998976,  0.04134658,  0.05270341,  0.06406024,
         0.07541706]),
 <a list of 10 Patch objects>)
In [12]:
# create distribution under the null hypothesis
null_vals = np.random.normal(0, diffs.std(), diffs.shape[0])
In [13]:
# plot null distribution
plt.hist(null_vals) 

# plot line for observed statistic
plt.axvline(diffs.mean(), color='r')
Out[13]:
<matplotlib.lines.Line2D at 0x7fc1d618b860>
In [14]:
# compute p value
(null_vals > diffs.mean()).mean()
Out[14]:
0.0872
In [ ]: